library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(glmnet)
library(randomForestSRC)
AAQoL machine learning analysis with unbalanced random forest
Data set
This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.
Input data set
# Read the survey export and prepare the analysis data set:
#   * every character column is converted to a factor;
#   * the two English items get "Not at all" as reference level and
#     Ethnicity gets "Chinese";
#   * Income is collapsed into the two-level factor Income_median
#     (brackets up to $59,999 -> "Below", $60,000 and higher -> "Above").
#     NOTE(review): the name suggests a split at the sample median income;
#     confirm the cut point against the data.
qol <- read_csv("AAQoL.csv") |>
  mutate(across(where(is.character), as.factor)) |>
  mutate(
    `English Difficulties` = relevel(`English Difficulties`, ref = "Not at all"),
    `English Speaking` = relevel(`English Speaking`, ref = "Not at all"),
    Ethnicity = relevel(Ethnicity, ref = "Chinese"),
    # Any Income value not listed here is passed through by `.default` and
    # then becomes NA in factor(), because it is not one of the two levels.
    Income_median = factor(
      case_match(
        Income,
        c(
          "$0 - $9,999",
          "$10,000 - $19,999",
          "$20,000 - $29,999",
          "$30,000 - $39,999",
          "$40,000 - $49,999",
          "$50,000 - $59,999"
        ) ~ "Below",
        c(
          "$60,000 - $69,999",
          "$70,000 and over"
        ) ~ "Above",
        .default = Income
      ),
      levels = c("Below", "Above")
    )
  )
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
Source of Information: Family
ps(Family)# A tibble: 4 × 3
Family n pct
<fct> <int> <dbl>
1 3 1 0.0383
2 No 1258 48.2
3 Yes 1331 51.0
4 <NA> 19 0.728
# Outcome: Family as a source of information.
# Keep only respondents who answered "No" or "Yes" (this removes the stray
# "3" level and missing answers), drop the now-unused factor levels, keep the
# outcome plus the predictor set, and discard rows with any missing value.
# The result is converted to a plain data.frame before model fitting.
rfdata <- qol |>
  filter(Family %in% c("No", "Yes")) |>
  mutate(Family = droplevels(Family)) |>
  select(
    Family, Ethnicity, Age, Gender, Religion, `Full Time Employment`,
    Income_median, `English Speaking`, `English Difficulties`,
    `See Family`:`Community Trust`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame()

# Earlier alternative, kept for reference:
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj

# Random forest for (possibly) imbalanced two-class data, scored by G-mean.
# `importance = TRUE` (spelled out; `T` can be reassigned) requests
# variable importance so that rfobj$importance is populated.
rfobj <- imbalanced(
  Family ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(rfobj) Sample size: 2187
Frequency of class labels: 1069, 1118
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 481.8583
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1382
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0458
(OOB) Brier score: 0.23103992
(OOB) Normalized Brier score: 0.92415967
(OOB) AUC: 0.65240365
(OOB) PR-AUC: 0.61641433
(OOB) G-mean: 0.60585351
(OOB) Requested performance error: 0.39414649
Confusion matrix:
predicted
observed No Yes class.error
No 711 358 0.3349
Yes 501 617 0.4481
(OOB) Misclassification rate: 0.3927755
print(rfobj) Sample size: 2187
Frequency of class labels: 1069, 1118
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 481.8583
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1382
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0458
(OOB) Brier score: 0.23103992
(OOB) Normalized Brier score: 0.92415967
(OOB) AUC: 0.65240365
(OOB) PR-AUC: 0.61641433
(OOB) G-mean: 0.60585351
(OOB) Requested performance error: 0.39414649
Confusion matrix:
predicted
observed No Yes class.error
No 711 358 0.3349
Yes 501 617 0.4481
(OOB) Misclassification rate: 0.3927755
plot(rfobj,plots.one.page = FALSE)

all No Yes
Age 0.0293 NA NA
Ethnicity 0.0262 NA NA
Similar Values 0.0073 NA NA
Close-knit Community 0.0061 NA NA
Spend Time Together 0.0057 NA NA
Religion 0.0045 NA NA
Community Shares Values 0.0039 NA NA
Close Friends 0.0036 NA NA
Helpful Community 0.0029 NA NA
EnglishDiff 0.0026 NA NA
Community Trust 0.0025 NA NA
Togetherness 0.0020 NA NA
Trust 0.0018 NA NA
Get Along 0.0017 NA NA
Religious Importance 0.0013 NA NA
Family Pride 0.0011 NA NA
EnglishSpeak 0.0010 NA NA
Close Family -0.0005 NA NA
See Friends -0.0005 NA NA
Loyalty -0.0009 NA NA
Gender -0.0013 NA NA
See Family -0.0018 NA NA
Family Respect -0.0018 NA NA
Expression -0.0020 NA NA
Feel Close -0.0020 NA NA
Helpful Family -0.0021 NA NA
rfobj$importance all No Yes
Ethnicity 0.0262227715 NA NA
Age 0.0293419950 NA NA
Gender -0.0013429744 NA NA
Religion 0.0045072860 NA NA
Employment -0.0053062773 NA NA
Income_median -0.0035362445 NA NA
EnglishSpeak 0.0009702818 NA NA
EnglishDiff 0.0026345779 NA NA
See Family -0.0017686292 NA NA
Close Family -0.0004748989 NA NA
Helpful Family -0.0021238020 NA NA
See Friends -0.0005031878 NA NA
Close Friends 0.0036216084 NA NA
Helpful Friends -0.0037318723 NA NA
Family Respect -0.0018223314 NA NA
Similar Values 0.0072965800 NA NA
Successful Family -0.0032018158 NA NA
Trust 0.0017544161 NA NA
Loyalty -0.0008508267 NA NA
Family Pride 0.0010968648 NA NA
Expression -0.0019606961 NA NA
Spend Time Together 0.0056799988 NA NA
Feel Close -0.0020219520 NA NA
Togetherness 0.0020134801 NA NA
Religious Attendance -0.0031825823 NA NA
Religious Importance 0.0013431897 NA NA
Close-knit Community 0.0060607174 NA NA
Helpful Community 0.0028718174 NA NA
Community Shares Values 0.0039405524 NA NA
Get Along 0.0017066330 NA NA
Community Trust 0.0025014317 NA NA
# Horizontal bar chart of the overall ("all") variable-importance scores
# stored in rfobj, ordered so the largest score is drawn at the top.
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = "Variable",
    y = "Importance"
  ) +
  theme_bw()
plot(importance_plot)
Source of Information: Health Professionals
ps(`Heal Professionals`)# A tibble: 3 × 3
`Heal Professionals` n pct
<fct> <int> <dbl>
1 No 1326 50.8
2 Yes 1264 48.4
3 <NA> 19 0.728
# Outcome: health professionals as a source of information.
# Keep the outcome plus the predictor set, discard rows with any missing
# value, shorten three predictor names, and convert to a plain data.frame.
rfdata <- qol |>
  select(
    `Heal Professionals`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `English Speaking`,
    `English Difficulties`, `See Family`:`Community Trust`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame()

# Random forest for (possibly) imbalanced two-class data, scored by G-mean.
# Uses left assignment and `TRUE` instead of the original `->` and `T`.
rfobj <- imbalanced(
  `Heal Professionals` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(rfobj) Sample size: 2188
Frequency of class labels: 1067, 1121
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 482.3017
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1383
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0506
(OOB) Brier score: 0.2315831
(OOB) Normalized Brier score: 0.92633239
(OOB) AUC: 0.65696526
(OOB) PR-AUC: 0.62672582
(OOB) G-mean: 0.61389953
(OOB) Requested performance error: 0.38610047
Confusion matrix:
predicted
observed No Yes class.error
No 660 407 0.3814
Yes 438 683 0.3907
(OOB) Misclassification rate: 0.3861974
plot(rfobj,plots.one.page = FALSE)

all No Yes
EnglishSpeak 0.0238 NA NA
Close Friends 0.0091 NA NA
Religious Attendance 0.0063 NA NA
Helpful Community 0.0061 NA NA
See Friends 0.0049 NA NA
Income_median 0.0040 NA NA
Close Family 0.0023 NA NA
Community Trust 0.0018 NA NA
See Family 0.0013 NA NA
Community Shares Values 0.0010 NA NA
Get Along 0.0006 NA NA
Employment 0.0004 NA NA
Similar Values 0.0000 NA NA
EnglishDiff -0.0001 NA NA
Feel Close -0.0004 NA NA
Spend Time Together -0.0005 NA NA
Trust -0.0005 NA NA
Family Respect -0.0014 NA NA
Age -0.0014 NA NA
Gender -0.0014 NA NA
Expression -0.0018 NA NA
Loyalty -0.0028 NA NA
Togetherness -0.0033 NA NA
Close-knit Community -0.0037 NA NA
Helpful Friends -0.0037 NA NA
Religious Importance -0.0037 NA NA
rfobj$importance all No Yes
Ethnicity -4.967040e-03 NA NA
Age -1.406553e-03 NA NA
Gender -1.418102e-03 NA NA
Religion -4.031489e-03 NA NA
Employment 4.223221e-04 NA NA
Income_median 4.030721e-03 NA NA
EnglishSpeak 2.379387e-02 NA NA
EnglishDiff -7.625946e-05 NA NA
See Family 1.336760e-03 NA NA
Close Family 2.329798e-03 NA NA
Helpful Family -6.385242e-03 NA NA
See Friends 4.939594e-03 NA NA
Close Friends 9.055062e-03 NA NA
Helpful Friends -3.743238e-03 NA NA
Family Respect -1.393644e-03 NA NA
Similar Values 1.634255e-05 NA NA
Successful Family -4.137012e-03 NA NA
Trust -4.648994e-04 NA NA
Loyalty -2.784139e-03 NA NA
Family Pride -6.775320e-03 NA NA
Expression -1.843914e-03 NA NA
Spend Time Together -4.648994e-04 NA NA
Feel Close -3.730365e-04 NA NA
Togetherness -3.272003e-03 NA NA
Religious Attendance 6.310613e-03 NA NA
Religious Importance -3.747298e-03 NA NA
Close-knit Community -3.657954e-03 NA NA
Helpful Community 6.076049e-03 NA NA
Community Shares Values 9.881442e-04 NA NA
Get Along 5.879244e-04 NA NA
Community Trust 1.845371e-03 NA NA
# Horizontal bar chart of the overall ("all") variable-importance scores
# stored in rfobj, ordered so the largest score is drawn at the top.
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = "Variable",
    y = "Importance"
  ) +
  theme_bw()
plot(importance_plot)
Health Insurance
ps(`Health Insurance`)# A tibble: 3 × 3
`Health Insurance` n pct
<fct> <int> <dbl>
1 0 381 14.6
2 Yes 2207 84.6
3 <NA> 21 0.805
Random Forest (randomForestSRC)
# install.packages("randomForestSRC")

# Outcome: Health Insurance (levels "0" and "Yes" in the tabulation).
# Keep the outcome plus the predictor set, discard rows with any missing
# value, shorten three predictor names, and convert to a plain data.frame.
rfdata <- qol |>
  select(
    `Health Insurance`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `English Speaking`,
    `English Difficulties`, `See Family`:`Community Trust`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame()

# Random forest for imbalanced two-class data, scored by G-mean.
# `importance = TRUE` (spelled out; `T` can be reassigned) requests
# variable importance so that imb$importance is populated.
imb <- imbalanced(
  `Health Insurance` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb) Sample size: 2189
Frequency of class labels: 292, 1897
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 258.6077
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1383
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 6.4966
(OOB) Brier score: 0.10366262
(OOB) Normalized Brier score: 0.41465048
(OOB) AUC: 0.74897639
(OOB) PR-AUC: 0.33153441
(OOB) G-mean: 0.66637896
(OOB) Requested performance error: 0.33362104
Confusion matrix:
predicted
observed 0 Yes class.error
0 222 70 0.2397
Yes 789 1108 0.4159
(OOB) Misclassification rate: 0.3924166
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1897.0000000 292.0000000 6.4965753 0.1333942 0.7602740 0.5840801
prec npv misclass brier brier.norm auc
0.2195846 0.9405772 0.3924166 0.1036626 0.4146505 0.7489764
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.3407521 0.4627145 0.1333942 0.3315344 0.5035655 0.5645467
gmean
0.6663790
# Horizontal bar chart of the overall ("all") variable-importance scores
# stored in imb, ordered so the largest score is drawn at the top.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = "Variable",
    y = "Importance"
  ) +
  theme_minimal()
plot(importance_plot)
Dental Insurance
ps(`Dental Insurance`)# A tibble: 3 × 3
`Dental Insurance` n pct
<fct> <int> <dbl>
1 0 1050 40.2
2 Yes 1529 58.6
3 <NA> 30 1.15
Random Forest (randomForestSRC)
# install.packages("randomForestSRC")

# Outcome: Dental Insurance (levels "0" and "Yes" in the tabulation).
# Keep the outcome plus the predictor set, discard rows with any missing
# value, shorten three predictor names, and convert to a plain data.frame.
rfdata <- qol |>
  select(
    `Dental Insurance`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `English Speaking`,
    `English Difficulties`, `See Family`:`Community Trust`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame()

# Random forest for imbalanced two-class data, scored by G-mean.
# `importance = TRUE` (spelled out; `T` can be reassigned) requests
# variable importance so that imb$importance is populated.
imb <- imbalanced(
  `Dental Insurance` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb) Sample size: 2184
Frequency of class labels: 849, 1335
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 397.4423
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1380
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.5724
(OOB) Brier score: 0.17748606
(OOB) Normalized Brier score: 0.70994423
(OOB) AUC: 0.79696757
(OOB) PR-AUC: 0.70071511
(OOB) G-mean: 0.73335655
(OOB) Requested performance error: 0.26664345
Confusion matrix:
predicted
observed 0 Yes class.error
0 643 206 0.2426
Yes 387 948 0.2899
(OOB) Misclassification rate: 0.2715201
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1335.0000000 849.0000000 1.5724382 0.3887363 0.7573616 0.7101124
prec npv misclass brier brier.norm auc
0.6242718 0.8214905 0.2715201 0.1774861 0.7099442 0.7969676
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.6844066 0.7210108 0.3887363 0.7007151 0.7088816 0.7271837
gmean
0.7333566
# Horizontal bar chart of the overall ("all") variable-importance scores
# stored in imb, ordered so the largest score is drawn at the top.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = "Variable",
    y = "Importance"
  ) +
  theme_minimal()
plot(importance_plot)
Physical Checkup
ps(`Physical Check-up`)# A tibble: 3 × 3
`Physical Check-up` n pct
<fct> <int> <dbl>
1 0 833 31.9
2 Yes 1740 66.7
3 <NA> 36 1.38
Random Forest (randomForestSRC)
# install.packages("randomForestSRC")

# Outcome: Physical Check-up (levels "0" and "Yes" in the tabulation).
# Keep the outcome plus the predictor set, discard rows with any missing
# value, shorten three predictor names, and convert to a plain data.frame.
rfdata <- qol |>
  select(
    `Physical Check-up`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `English Speaking`,
    `English Difficulties`, `See Family`:`Community Trust`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame()

# Random forest for imbalanced two-class data, scored by G-mean.
# `importance = TRUE` (spelled out; `T` can be reassigned) requests
# variable importance so that imb$importance is populated.
imb <- imbalanced(
  `Physical Check-up` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb) Sample size: 2178
Frequency of class labels: 704, 1474
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 425.4703
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1376
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 2.0938
(OOB) Brier score: 0.19880312
(OOB) Normalized Brier score: 0.7952125
(OOB) AUC: 0.68752313
(OOB) PR-AUC: 0.49471621
(OOB) G-mean: 0.63669065
(OOB) Requested performance error: 0.36330935
Confusion matrix:
predicted
observed 0 Yes class.error
0 488 216 0.3068
Yes 612 862 0.4152
(OOB) Misclassification rate: 0.3801653
plot(imb,plots.one.page = F)

all 0 Yes
Age 0.0573 NA NA
Income_median 0.0223 NA NA
Ethnicity 0.0103 NA NA
Employment 0.0085 NA NA
Togetherness 0.0020 NA NA
Community Trust 0.0012 NA NA
Helpful Friends 0.0007 NA NA
Close Family 0.0007 NA NA
Religion 0.0003 NA NA
Feel Close -0.0005 NA NA
Expression -0.0011 NA NA
See Family -0.0012 NA NA
Similar Values -0.0016 NA NA
EnglishSpeak -0.0017 NA NA
Religious Importance -0.0031 NA NA
Loyalty -0.0031 NA NA
Gender -0.0038 NA NA
Helpful Family -0.0041 NA NA
EnglishDiff -0.0050 NA NA
Community Shares Values -0.0051 NA NA
Helpful Community -0.0055 NA NA
Family Pride -0.0056 NA NA
Spend Time Together -0.0059 NA NA
See Friends -0.0059 NA NA
Close-knit Community -0.0062 NA NA
Family Respect -0.0068 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1474.0000000 704.0000000 2.0937500 0.3232323 0.6931818 0.5848033
prec npv misclass brier brier.norm auc
0.4436364 0.7996289 0.3801653 0.1988031 0.7952125 0.6875231
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.5410200 0.6008462 0.3232323 0.4947162 0.5888553 0.6187684
gmean
0.6366907
# Horizontal bar chart of the overall ("all") variable-importance scores
# stored in imb, ordered so the largest score is drawn at the top.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = "Variable",
    y = "Importance"
  ) +
  theme_minimal()
plot(importance_plot)
Dental Checkup
ps(`Dentist Check-up`)# A tibble: 3 × 3
`Dentist Check-up` n pct
<fct> <int> <dbl>
1 0 1100 42.2
2 Yes 1462 56.0
3 <NA> 47 1.80
Random Forest (randomForestSRC)
# install.packages("randomForestSRC")

# Outcome: Dentist Check-up (levels "0" and "Yes" in the tabulation).
# Keep the outcome plus the predictor set, discard rows with any missing
# value, shorten three predictor names, and convert to a plain data.frame.
rfdata <- qol |>
  select(
    `Dentist Check-up`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `English Speaking`,
    `English Difficulties`, `See Family`:`Community Trust`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame()

# Random forest for imbalanced two-class data, scored by G-mean.
# `importance = TRUE` (spelled out; `T` can be reassigned) requests
# variable importance so that imb$importance is populated.
imb <- imbalanced(
  `Dentist Check-up` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb) Sample size: 2175
Frequency of class labels: 896, 1279
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 450.7223
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1375
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.4275
(OOB) Brier score: 0.21277143
(OOB) Normalized Brier score: 0.85108573
(OOB) AUC: 0.70613595
(OOB) PR-AUC: 0.60028161
(OOB) G-mean: 0.65337547
(OOB) Requested performance error: 0.34662453
Confusion matrix:
predicted
observed 0 Yes class.error
0 610 286 0.3192
Yes 477 802 0.3729
(OOB) Misclassification rate: 0.3508046
plot(imb,plots.one.page = F)

all 0 Yes
EnglishSpeak 0.0192 NA NA
Age 0.0122 NA NA
Gender 0.0032 NA NA
Helpful Friends 0.0031 NA NA
Religious Importance 0.0023 NA NA
Ethnicity 0.0016 NA NA
Income_median 0.0014 NA NA
Helpful Family 0.0011 NA NA
EnglishDiff 0.0008 NA NA
Feel Close 0.0007 NA NA
Close Friends 0.0007 NA NA
See Friends 0.0005 NA NA
Close Family 0.0005 NA NA
Employment 0.0001 NA NA
Community Trust -0.0002 NA NA
Spend Time Together -0.0002 NA NA
Trust -0.0005 NA NA
Religion -0.0005 NA NA
See Family -0.0013 NA NA
Get Along -0.0017 NA NA
Religious Attendance -0.0023 NA NA
Loyalty -0.0028 NA NA
Family Pride -0.0036 NA NA
Successful Family -0.0044 NA NA
Togetherness -0.0045 NA NA
Expression -0.0049 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1279.0000000 896.0000000 1.4274554 0.4119540 0.6808036 0.6270524
prec npv misclass brier brier.norm auc
0.5611776 0.7371324 0.3508046 0.2127714 0.8510857 0.7061359
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.6152295 0.6449334 0.4119540 0.6002816 0.6343025 0.6491544
gmean
0.6533755
# Horizontal bar chart of the overall ("all") variable-importance scores
# stored in imb, ordered so the largest score is drawn at the top.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = "Variable",
    y = "Importance"
  ) +
  theme_minimal()
plot(importance_plot)
Urgent Care
ps(`Urgentcare`)# A tibble: 3 × 3
Urgentcare n pct
<fct> <int> <dbl>
1 0 2112 81.0
2 Yes 440 16.9
3 <NA> 57 2.18
Random Forest (randomForestSRC)
# install.packages("randomForestSRC")

# Outcome: Urgentcare (levels "0" and "Yes" in the tabulation).
# Keep the outcome plus the predictor set, discard rows with any missing
# value, shorten three predictor names, and convert to a plain data.frame.
rfdata <- qol |>
  select(
    Urgentcare, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `English Speaking`,
    `English Difficulties`, `See Family`:`Community Trust`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame()

# Random forest for imbalanced two-class data, scored by G-mean.
# `importance = TRUE` (spelled out; `T` can be reassigned) requests
# variable importance so that imb$importance is populated.
imb <- imbalanced(
  Urgentcare ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb) Sample size: 2167
Frequency of class labels: 1808, 359
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 326.4887
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1370
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 5.0362
(OOB) Brier score: 0.1369538
(OOB) Normalized Brier score: 0.54781519
(OOB) AUC: 0.59534612
(OOB) PR-AUC: 0.23587161
(OOB) G-mean: 0.5668512
(OOB) Requested performance error: 0.4331488
Confusion matrix:
predicted
observed 0 Yes class.error
0 880 928 0.5133
Yes 122 237 0.3398
(OOB) Misclassification rate: 0.4845408
plot(imb,plots.one.page = F)

all 0 Yes
Close Family 0.0370 NA NA
Ethnicity 0.0241 NA NA
Age 0.0218 NA NA
Close Friends 0.0195 NA NA
Loyalty 0.0195 NA NA
Trust 0.0190 NA NA
EnglishDiff 0.0187 NA NA
Helpful Family 0.0186 NA NA
Religion 0.0164 NA NA
Family Respect 0.0162 NA NA
EnglishSpeak 0.0154 NA NA
Religious Attendance 0.0152 NA NA
Spend Time Together 0.0146 NA NA
Feel Close 0.0143 NA NA
Community Trust 0.0132 NA NA
Family Pride 0.0125 NA NA
See Friends 0.0124 NA NA
Helpful Friends 0.0123 NA NA
Helpful Community 0.0120 NA NA
Togetherness 0.0113 NA NA
Close-knit Community 0.0107 NA NA
Income_median 0.0107 NA NA
Religious Importance 0.0089 NA NA
Expression 0.0087 NA NA
Similar Values 0.0085 NA NA
See Family 0.0083 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1808.0000000 359.0000000 5.0362117 0.1656668 0.6601671 0.4867257
prec npv misclass brier brier.norm auc
0.2034335 0.8782435 0.4845408 0.1369538 0.5478152 0.5953461
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.3110236 0.4156465 0.1656668 0.2358716 0.4389374 0.4912489
gmean
0.5668512
# Horizontal bar chart of the overall ("all") variable-importance scores
# stored in imb, ordered so the largest score is drawn at the top.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = "Variable",
    y = "Importance"
  ) +
  theme_minimal()
plot(importance_plot)
Folk Medicine
ps(`Folkmedicine`)# A tibble: 3 × 3
Folkmedicine n pct
<fct> <int> <dbl>
1 0 2189 83.9
2 Yes 348 13.3
3 <NA> 72 2.76
Random Forest (randomForestSRC)
# install.packages("randomForestSRC")

# Outcome: Folkmedicine (levels "0" and "Yes" in the tabulation).
# Keep the outcome plus the predictor set, discard rows with any missing
# value, shorten three predictor names, and convert to a plain data.frame.
rfdata <- qol |>
  select(
    Folkmedicine, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `English Speaking`,
    `English Difficulties`, `See Family`:`Community Trust`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame()

# Random forest for imbalanced two-class data, scored by G-mean.
# `importance = TRUE` (spelled out; `T` can be reassigned) requests
# variable importance so that imb$importance is populated.
imb <- imbalanced(
  Folkmedicine ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb) Sample size: 2152
Frequency of class labels: 1866, 286
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 271.6277
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1360
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 6.5245
(OOB) Brier score: 0.11210216
(OOB) Normalized Brier score: 0.44840865
(OOB) AUC: 0.66744617
(OOB) PR-AUC: 0.21621988
(OOB) G-mean: 0.62839779
(OOB) Requested performance error: 0.37160221
Confusion matrix:
predicted
observed 0 Yes class.error
0 1028 838 0.4491
Yes 81 205 0.2832
(OOB) Misclassification rate: 0.4270446
plot(imb,plots.one.page = F)

all 0 Yes
Age 0.0504 NA NA
Ethnicity 0.0468 NA NA
EnglishSpeak 0.0196 NA NA
EnglishDiff 0.0178 NA NA
Employment 0.0132 NA NA
Helpful Friends 0.0117 NA NA
Family Pride 0.0111 NA NA
Trust 0.0095 NA NA
Feel Close 0.0079 NA NA
Community Trust 0.0068 NA NA
Expression 0.0066 NA NA
Helpful Community 0.0061 NA NA
Income_median 0.0061 NA NA
Close Friends 0.0049 NA NA
Loyalty 0.0047 NA NA
Close-knit Community 0.0046 NA NA
Togetherness 0.0046 NA NA
Community Shares Values 0.0044 NA NA
See Friends 0.0043 NA NA
Religious Attendance 0.0040 NA NA
Successful Family 0.0037 NA NA
Similar Values 0.0035 NA NA
Religion 0.0032 NA NA
Get Along 0.0031 NA NA
Family Respect 0.0026 NA NA
Religious Importance 0.0022 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1866.0000000 286.0000000 6.5244755 0.1328996 0.7167832 0.5509110
prec npv misclass brier brier.norm auc
0.1965484 0.9269612 0.4270446 0.1121022 0.4484087 0.6674462
F1 F1mod pr.auc.rand pr.auc F1gmean F1modgmean
0.3085026 0.4265804 0.1328996 0.2162199 0.4684502 0.5274891
gmean
0.6283978
# Horizontal bar chart of the overall ("all") variable-importance scores
# stored in imb, ordered so the largest score is drawn at the top.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = "Variable",
    y = "Importance"
  ) +
  theme_minimal()
plot(importance_plot)